#This script calculates the cosine similarity scores for the synthetic articles
#It does so using pre-trained embeddings and transformation matrices
library(quanteda)
library(conText)
library(dplyr)
library(text2vec)
library(data.table)
library(tidytext)
source("utils.R")

# Fix the RNG seed up front so repeated runs of the script are reproducible.
set.seed(123L)

# Inputs: the synthetic articles and the per-language term lists.
data <- read.csv(file = "data/synthetic/synthetic_articles_all_gpt-4o.csv")
terms <- read.csv(file = "data/synthetic/oppsup_synthetic_terms.csv")

# Distinct language codes found in the term list; these drive the per-language
# processing further down.
countries <- unique(terms[["langcode"]])

# Lookup table from the language name used in the articles' `Language` column
# to the language code used in the term list and the embedding file names.
language_codes <- setNames(
  c("en", "fr", "es", "ru", "zh-CN", "ar", "ja", "ko"),
  c(
    "English", "French", "Spanish", "Russian",
    "Mandarin", "Arabic", "Japanese", "Korean"
  )
)

# Derive per-language bookkeeping columns on the articles:
#   article_index - running position of the article within its language
#   yearwk        - bin id; each consecutive run of 5 articles shares one bin
#   langcode      - language code looked up from `language_codes`
# NOTE(review): this comment used to say bins of 50 articles while the code
# divides by 5 -- confirm which bin size is intended.
data <- data %>%
  group_by(Language) %>%
  mutate(
    article_index = row_number(),
    yearwk = ceiling(article_index / 5)
  ) %>%
  ungroup() %>%
  mutate(
    # as.character() keeps this a lookup by name even if `Language` is a
    # factor (indexing with a factor would use its integer codes); unname()
    # stops the language names riding along as a names attribute on the column.
    langcode = unname(language_codes[as.character(Language)])
  )

# Dimensionality tag of the transformation-matrix file for each language code;
# it is pasted into the file name when the matrix is loaded.
embedding_dims <- c(
  ar = 15,
  en = 50,
  fr = 25,
  es = 25,
  ja = 10,
  ko = 15,
  ru = 25,
  `zh-CN` = 15
)

# Compute similarity scores for the articles of one language.
#
# Reads the globals `terms`, `data` and `embedding_dims`, loads that language's
# pre-trained GloVe vectors and transformation matrix from disk, and passes the
# articles to get_similarity_scores() (sourced from utils.R).
#
# Args:
#   country_code: a single language code (e.g. "en"); must be one of
#     names(embedding_dims).
#
# Returns:
#   The object returned by get_similarity_scores(), with an added
#   `country_code` column.
#
# Stops with an error if no dimensionality is configured for `country_code`
# or if either embedding file is missing.
process_cos_sim <- function(country_code) {
  # Fail early on an unknown code: indexing `embedding_dims` with it would
  # give NA, which then surfaces as a misleading "files do not exist" error.
  if (length(country_code) != 1L ||
    !(country_code %in% names(embedding_dims))) {
    stop(
      "No embedding dimension configured for language code '",
      paste(country_code, collapse = ", "), "'.",
      call. = FALSE
    )
  }
  dimension <- embedding_dims[[country_code]]

  # Build the file paths for this language
  embedding_dir <- paste0(
    "data/pretrained_embedding/alcembeddings/gloVe_", country_code
  )
  local_transform_path <- paste0(
    embedding_dir, "/glove_transform_", country_code, "wiki_", dimension, ".rds"
  )
  local_glove_path <- paste0(
    embedding_dir, "/glove_vectors_", country_code, "wiki.txt"
  )

  # Scalar `&&` (not elementwise `&`) is the right operator in an `if` test
  if (!(file.exists(local_transform_path) && file.exists(local_glove_path))) {
    stop(paste("Embedding files for", country_code, "do not exist."))
  }

  local_transform <- as.matrix(readRDS(local_transform_path))

  # The GloVe file has the word in the first column and the vector components
  # in the remaining columns; turn it into a matrix with words as row names.
  # NOTE(review): GloVe vocabularies can contain quote characters; if rows are
  # lost or mis-split, consider fread(..., quote = "").
  glove_data <- fread(local_glove_path, header = FALSE)
  local_glove <- as.matrix(glove_data[, -1, with = FALSE])
  rownames(local_glove) <- glove_data[[1L]]

  # Terms and articles for this language
  term <- terms %>% filter(langcode == country_code)
  articles <- data %>% filter(langcode == country_code)
  article_corpus <- corpus(articles, text_field = "Article")

  # Opposition words are passed as the first vector, support words as the
  # second; scores are grouped by the `yearwk` bin.
  cos_simsdf_all <- get_similarity_scores(
    x = article_corpus,
    target = "POLITFIG",
    first_vec = term$oppword,
    second_vec = term$supword,
    pre_trained = local_glove,
    transform_matrix = local_transform,
    window = 12L,
    group_var = "yearwk",
    norm = "l2"
  )

  # Tag the result with its language code so results can be told apart
  cos_simsdf_all$country_code <- country_code
  cos_simsdf_all
}

# Score every language, then persist the results: one combined list plus one
# file per language.
results_list <- lapply(countries, process_cos_sim)

# Create the directory that is actually written to. Checking a differently
# named directory would skip creation and make saveRDS() fail.
output_root <- "data/output/cos_sims_synthetic_gpt-4o"
if (!dir.exists(output_root)) {
  dir.create(output_root, recursive = TRUE)
}
saveRDS(results_list, paste0(output_root, "/cos_sims_synthetic_all.rds"))

# Save the per-language results, each in its own sub-directory
for (i in seq_along(countries)) {
  dir_name <- paste0(output_root, "/", countries[i])
  if (!dir.exists(dir_name)) {
    dir.create(dir_name, recursive = TRUE)
  }

  saveRDS(results_list[[i]], paste0(dir_name, "/", "cos_simsdf_all.rds"))
}

# Look at high-scoring articles for English ----

# English articles, each tagged with a running `doc_id` so that sentence-level
# rows can be traced back to their article.
endat <- data %>%
  filter(Language == "English") %>%
  mutate(doc_id = row_number())

# English term lists: opposition words first, support words second.
enterm <- terms %>% filter(langcode == "en")
first_synth <- enterm$oppword
second_synth <- enterm$supword

# One row per sentence of each article.
# NOTE(review): unnest_tokens() lower-cases text by default (to_lower = TRUE);
# confirm the later match on the target "POLITFIG" is case-insensitive.
endat_sentences <- unnest_tokens(
  endat,
  output = "sentence",
  input = "Article",
  token = "sentences"
)

# Load the English transformation matrix and GloVe vectors.
# The dimensionality is taken from `embedding_dims` so it cannot drift from
# the value process_cos_sim() uses for "en".
dimension <- embedding_dims[["en"]]
local_transform_path <- paste0(
  "data/pretrained_embedding/alcembeddings/gloVe_en/glove_transform_enwiki_",
  dimension, ".rds"
)
local_glove_path <- "data/pretrained_embedding/alcembeddings/gloVe_en/glove_vectors_enwiki.txt"

# Same guard as in process_cos_sim(): fail with a clear message if either
# file is missing instead of an opaque read error.
if (!(file.exists(local_transform_path) && file.exists(local_glove_path))) {
  stop(paste("Embedding files for", "en", "do not exist."))
}

# Read in the transform matrix
local_transform <- as.matrix(readRDS(local_transform_path))

# Read the GloVe vectors: first column is the word, the remaining columns are
# the vector components; the words become the matrix row names.
glove_data <- fread(local_glove_path, header = FALSE)
glove_words <- glove_data[[1L]]
glove_vectors <- as.matrix(glove_data[, -1, with = FALSE])
rownames(glove_vectors) <- glove_words

# Final pre-trained object
local_glove <- glove_vectors

# Give every sentence row a unique document name of the form
# "<article doc_id>_<sentence row number>".
sentence_row <- seq.int(nrow(endat_sentences))
endat_sentences$doc_id_unique <- paste0(
  endat_sentences$doc_id, "_", sentence_row
)

# Sentence-level corpus; the unique name above is the document id.
corpus_sentences <- corpus(
  endat_sentences,
  text_field = "sentence",
  docid_field = "doc_id_unique"
)

# Strongly prefer fixed over scientific notation when numbers are rendered.
options(scipen = 999)

# Score the sentences, grouped by the article they came from (`doc_id`).
cos_simsdf_sentences <- get_similarity_scores(
  x = corpus_sentences,
  target = "POLITFIG",
  first_vec = first_synth,
  second_vec = second_synth,
  pre_trained = local_glove,
  transform_matrix = local_transform,
  window = 12L,
  group_var = "doc_id",
  norm = "l2"
)

# Attach the scores to the English articles. The scores were computed with
# group_var = "doc_id", so `group` holds doc_id values and `doc_id` is the
# column to join on.
# NOTE(review): the join previously used an `Index` column that this script
# never creates; confirm nothing relied on `Index` differing from `doc_id`.
sentence_scores <- cos_simsdf_sentences %>%
  mutate(group = as.integer(as.character(group)))

endat_scored <- endat %>%
  left_join(sentence_scores, by = c("doc_id" = "group"))

# readr is not attached by this script, so call write_csv() via its namespace.
readr::write_csv(endat_scored, "data/synthetic/endat_gpt-4o_scored.csv")
